Open libraries
suppressMessages(library(GenomicRanges))
suppressMessages(library(dplyr))
suppressMessages(library(tidyr))
suppressMessages(library(ggplot2))
suppressMessages(library(BiocGenerics))
suppressMessages(library(knitr))
suppressMessages(library(stringr))
suppressMessages(library(ggtree))
suppressMessages(library(ape))
suppressMessages(library(seqinr))
suppressMessages(library(data.table))
suppressMessages(library(phytools))
suppressMessages(library(Rsamtools))
suppressMessages(library(MASS))
suppressMessages(library(lme4))
suppressMessages(library(stats))
suppressMessages(library(RColorBrewer))
suppressMessages(library(ggpubr))
The initial analysis of single nucleotide variant and indel burdens is illustrated using the KX001 (29 year male) dataset. The input files for this part of the analysis are available on Mendeley Data (data post removal of samples that failed QC): 1) mats_and_params_KX001_4_01 2) tree_KX001_4_01_standard_rho01.tree 3) annotated_mut_set_KX001_4_01__standard_rho01 4) KX001_4_sensitivity Note these input files are also available for all the individuals analysed.
ID = "KX001" #Edit
Iteration = "KX001_4" #Edit
Run_ID = "KX001_4_01" #Edit
filtering_ID = "standard_rho01" #Edit
lim_snv = 1000 #Edit- used for ylim_snv on graphs of mutation burden
br_snv = c(200,400,600,800,1000)
lim_indel = 50 #Edit- used for ylim_snv on graphs of mutation burden
br_indel = c(10,20,30,40,50)
setwd = paste0("~/Documents/PhD/Sequencing_results/DNA_seq/",ID,"/",Iteration,"/subs/vaf_cut/") #Ensure this directory exists and copy mats_and_params file there
mats_and_param_file = paste0("~/Documents/PhD/Sequencing_results/DNA_seq/",ID,"/",Iteration,"/trees/vaf_cut/output/mats_and_params_", Run_ID)
tree_file_path = paste0("~/Documents/PhD/Sequencing_results/DNA_seq/",ID,"/",Iteration,"/trees/vaf_cut/output/tree_", Run_ID,"_",filtering_ID, ".tree")
file_annot = paste0("~/Documents/PhD/Sequencing_results/DNA_seq/",ID,"/",Iteration,"/trees/vaf_cut/output/annotated_mut_set_", Run_ID,"_",filtering_ID)
sensitivity_df = paste0("~/Documents/PhD/Sequencing_results/DNA_seq/",ID,"/",Iteration,"/trees/vaf_cut/output/",Iteration,"_sensitivity")
function_files=list.files('~/Documents/Bioinformatics/CGP/Functions/', full.names = TRUE, pattern = ".R")
suppressMessages(sapply(function_files, source))
## /Users/em16/Documents/Bioinformatics/CGP/Functions//filters_parallel_functions.R
## value ?
## visible FALSE
## /Users/em16/Documents/Bioinformatics/CGP/Functions//targeted_analysis_functions.R
## value ?
## visible FALSE
## /Users/em16/Documents/Bioinformatics/CGP/Functions//tree_functions.R
## value ?
## visible FALSE
load(file_annot)
load(mats_and_param_file)
tree <- read.tree(tree_file_path)
sensitivity <- read.table(sensitivity_df, stringsAsFactors = F, header = T)
write.csv(filtered_muts$COMB_mats.tree.build$NR, paste0("~/Documents/PhD/Sequencing_results/DNA_seq/",ID,"/",Iteration,"/subs/vaf_cut/mutation_burden/",Iteration,"_NR.csv"), row.names = T, quote=F)
write.csv(filtered_muts$COMB_mats.tree.build$NV, paste0("~/Documents/PhD/Sequencing_results/DNA_seq/",ID,"/",Iteration,"/subs/vaf_cut/mutation_burden/",Iteration,"_NV.csv"), row.names = T, quote=F)
SNVs_per_sample = colSums(filtered_muts$COMB_mats.tree.build$Genotype_bin[filtered_muts$COMB_mats.tree.build$mat$Mut_type == "SNV",] == 1)
mean(SNVs_per_sample); sd(SNVs_per_sample); range(SNVs_per_sample)
## [1] 427.4791
## [1] 65.34828
## [1] 228 632
Mutation_burden <- as.data.frame(colSums(filtered_muts$COMB_mats.tree.build$Genotype_bin[filtered_muts$COMB_mats.tree.build$mat$Mut_type == "SNV",] == 1))
Mutation_burden$Mean_depth <- (colSums(filtered_muts$COMB_mats.tree.build$NR)/nrow(filtered_muts$COMB_mats.tree.build$NR))
colnames(Mutation_burden)[1] <- "Number_mutations"
Mutation_burden$Sample <- rownames(Mutation_burden)
Summary_snvs <- as.data.frame(cbind(Mutation_burden[,c(3,1,2)]))
INDELs_per_sample = colSums(filtered_muts$COMB_mats.tree.build$Genotype_bin[filtered_muts$COMB_mats.tree.build$mat$Mut_type == "INDEL",] == 1)
mean(INDELs_per_sample); sd(INDELs_per_sample); range(INDELs_per_sample)
## [1] 10.54545
## [1] 4.867938
## [1] 1 29
Indel_burden <- as.data.frame(colSums(filtered_muts$COMB_mats.tree.build$Genotype_bin[filtered_muts$COMB_mats.tree.build$mat$Mut_type == "INDEL",] == 1))
Indel_burden$Mean_depth <- (colSums(filtered_muts$COMB_mats.tree.build$NR)/nrow(filtered_muts$COMB_mats.tree.build$NR))
colnames(Indel_burden)[1] <- "Number_indels"
Indel_burden$Sample <- rownames(Indel_burden)
Summary_indels <- as.data.frame(cbind(Indel_burden[,c(3,1,2)]))
Summary_snvs: data.frame with one row per sample and the following columns:
Mean_depth: the average coverage per sample
Number_mutations: the number of mutations per sample
### Asymptotic model
# Sort data
sorted = sortedXyData(Summary_snvs$Mean_depth, Summary_snvs$Number_mutations)
# Run model
model.as = NLSstAsymptotic(sorted)
# Note: you will be alerted if the model does not converge (typically if data are not asymptotic or there is too little data)
# Look at results
model.as
## b0 b1 lrc
## -475.427582 1048.466358 -1.869659
mymodel = function(x){
b0 = model.as[1]
b1 = model.as[2]
lrc = model.as[3]
b0 + b1*(1-exp(-exp(lrc) * x))
}
myrange=1:round(max(Summary_snvs$Mean_depth))
asp = data.frame(x=myrange, y=unlist(lapply(myrange, FUN=mymodel)))
suppressWarnings(ggplot(Summary_snvs) +
ylim(1,lim_snv)+
xlim(0,50)+
geom_point(aes(x=Mean_depth,y=Number_mutations), data=Summary_snvs, pch=21) +
theme_bw() +
scale_fill_brewer(type="qual", palette=3) +
ylab("# Mutations") +
xlab("Coverage per colony")+
geom_line(aes(x=x, y=y), data=asp)+
labs(title = "SNV burden with asymptotic regression line"))
## Warning: Removed 3 row(s) containing missing values (geom_path).
new_Number_mutations = function(rd, Number_mutations){
Number_mutations + ( mymodel(30) - mymodel(rd)) ## can change depth here
}
Summary_snvs$Number_mutations_adj_as = apply(Summary_snvs, MARGIN=1, FUN=function(X) new_Number_mutations(rd=as.numeric(X[which(colnames(Summary_snvs)=="Mean_depth")]), Number_mutations=as.numeric(X[which(colnames(Summary_snvs)=="Number_mutations")]) ) )
ggplot(Summary_snvs) +
ylim(0,lim_snv)+
xlim(0,50)+
theme_bw() +
geom_point(aes(Mean_depth,Number_mutations_adj_as), pch=21) +
ylab("# Mutations") +
xlab("Coverage per colony")+
labs(title ="SNV burden corrected for depth of sequencing")
ggplot(Summary_snvs) +
ylim(0,lim_snv)+
xlim(0,50)+
theme_bw() +
geom_smooth(aes(Mean_depth,Number_mutations_adj_as), pch=21) +
ylab("# Mutations") +
xlab("Coverage per colony")+
labs(title ="SNV burden corrected for depth of sequencing")
## Warning: Ignoring unknown parameters: shape
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
write.table(Summary_snvs, paste0("~/Documents/PhD/Sequencing_results/DNA_seq/",ID,"/",Iteration,"/subs/vaf_cut/mutation_burden/",Iteration,"_sub_dep_adj.txt"), sep="\t", col.names = T, row.names = F, quote=F)
### Asymptotic model
# Sort data
sorted = sortedXyData(Summary_indels$Mean_depth, Summary_indels$Number_indels)
# Run model
model.as = NLSstAsymptotic(sorted)
# Note: you will be alerted if the model does not converge (typically if data are not asymptotic or there is too little data)
# If data are more linear then a linear regression is reasonable.
# Look at results
model.as
## b0 b1 lrc
## -21.30056 47.55293 -2.46786
mymodel = function(x){
b0 = model.as[1]
b1 = model.as[2]
lrc = model.as[3]
b0 + b1*(1-exp(-exp(lrc) * x))
}
myrange=1:round(max(Summary_indels$Mean_depth))
asp = data.frame(x=myrange, y=unlist(lapply(myrange, FUN=mymodel)))
suppressWarnings(ggplot(Summary_indels) +
ylim(1,lim_indel)+
xlim(0,50)+
geom_point(aes(x=Mean_depth,y=Number_indels), data=Summary_indels, pch=21) +
theme_bw() +
scale_fill_brewer(type="qual", palette=3) +
ylab("# Indels") +
xlab("Coverage per colony")+
geom_line(aes(x=x, y=y), data=asp)+
labs(title ="Indel burden with asymptotic regression line"))
## Warning: Removed 7 row(s) containing missing values (geom_path).
new_Number_indels = function(rd, Number_indels){
Number_indels + ( mymodel(30) - mymodel(rd)) ## can change depth here
}
Summary_indels$Number_indels_adj_as = apply(Summary_indels, MARGIN=1, FUN=function(X) new_Number_indels(rd=as.numeric(X[which(colnames(Summary_indels)=="Mean_depth")]), Number_indels=as.numeric(X[which(colnames(Summary_indels)=="Number_indels")]) ) )
ggplot(Summary_indels) +
ylim(0,lim_indel)+
xlim(0,50)+
theme_bw() +
geom_point(aes(Mean_depth,Number_indels_adj_as), pch=21) +
ylab("# Indels") +
xlab("Coverage per colony")+
labs(title ="Indel burden corrected for depth of sequencing")
ggplot(Summary_indels) +
ylim(0,lim_indel)+
xlim(0,50)+
theme_bw() +
geom_smooth(aes(Mean_depth,Number_indels_adj_as), pch=21) +
ylab("# Mutations") +
xlab("Coverage per colony")+
labs(title ="Indel burden corrected for depth of sequencing")
## Warning: Ignoring unknown parameters: shape
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
write.table(Summary_indels, paste0("~/Documents/PhD/Sequencing_results/DNA_seq/",ID,"/",Iteration,"/subs/vaf_cut/mutation_burden/",Iteration,"_indel_dep_adj.txt"), sep="\t", col.names = T, row.names = F, quote=F)
dnds_input <- filtered_muts$COMB_mats.tree.build$mat[,2:5]
dnds_input$SampleID <- ID
colnames(dnds_input) <- c("Chr","Pos","Ref","Alt","SampleID")
dnds_input <- dnds_input[,c("SampleID","Chr","Pos","Ref","Alt")]
write.table(dnds_input, paste0("~/Documents/PhD/Sequencing_results/DNA_seq/",ID,"/",Iteration,"/subs/vaf_cut/dnds/",Iteration, "_dnds_all.txt"), sep="\t", col.names = T, row.names = F, quote=F)
create_dnds_files = function(mat, select_vector = NULL) {
if(is.null(select_vector)) {dnds_file = mat[,2:5]} else {dnds_file = mat[select_vector,2:5]}
dnds_file$SampleID <- ID
colnames(dnds_file) <- c("Chr","Pos","Ref","Alt","SampleID")
dnds_file <- dnds_file[,c("SampleID","Chr","Pos","Ref","Alt")]
return(dnds_file)
}
shared_dnds_file = create_dnds_files(filtered_muts$COMB_mats.tree.build$mat, select_vector = filtered_muts$COMB_mats.tree.build$mat$mut_ref %in% rownames(filtered_muts$Genotype_shared_bin))
private_dnds_file = create_dnds_files(filtered_muts$COMB_mats.tree.build$mat, select_vector = !filtered_muts$COMB_mats.tree.build$mat$mut_ref %in% rownames(filtered_muts$Genotype_shared_bin))
write.table(shared_dnds_file, paste0("~/Documents/PhD/Sequencing_results/DNA_seq/",ID,"/",Iteration,"/subs/vaf_cut/dnds/",Iteration, "_dnds_shared.txt"), sep="\t", col.names = T, row.names = F, quote=F)
write.table(private_dnds_file, paste0("~/Documents/PhD/Sequencing_results/DNA_seq/",ID,"/",Iteration,"/subs/vaf_cut/dnds/",Iteration, "_dnds_private.txt"), sep="\t", col.names = T, row.names = F, quote=F)
new_genotype_mat <- filtered_muts$COMB_mats.tree.build$Genotype_bin
#Set all the genotypes to 0 as a baseline
new_genotype_mat[,] <- 0
details <- filtered_muts$COMB_mats.tree.build$mat
#Now go through each of the mutations in turn and replace the 0's with 1's
for(i in tree$edge[,2]) {
node=i
info=get_edge_info(tree,details,node)
samples=info$samples
muts=details$mut_ref[info$idx]
new_genotype_mat[muts,samples] <- 1
}
write.csv(new_genotype_mat, paste0("~/Documents/PhD/Sequencing_results/DNA_seq/",ID,"/",Iteration,"/subs/vaf_cut/mutation_burden/",Iteration, "_mutation_calls_tree.csv"),row.names = T, quote=F)
new_genotype_mat <- cbind(filtered_muts$COMB_mats.tree.build$mat[,2:5],new_genotype_mat[,])
samples = colnames(new_genotype_mat[5:ncol(new_genotype_mat)])
for (sample in samples) {
#Write vcf files for VariantCaller analysis - separate out "All mutations", "Shared mutations", "Private mutations".
vcf_file = new_genotype_mat[(new_genotype_mat[,sample] >0),c(1:4)]
names(vcf_file) = c("#CHROM", "POS", "REF", "ALT")
#Add the required null columns
vcf_file$ID = vcf_file$QUAL = vcf_file$FILTER = vcf_file$INFO = "."
#Rearrange into correct order
vcf_file = vcf_file[,c(1,2,5,3,4,6,7,8)]
#Write files
write.table(vcf_file, sep = "\t", quote = FALSE, file = paste0("~/Documents/PhD/Sequencing_results/DNA_seq/",ID,"/",Iteration,"/subs/vaf_cut/hdp/",sample,"_mutations_all.vcf"), row.names = FALSE)
}
all_muts <- filtered_muts$COMB_mats.tree.build$mat[filtered_muts$COMB_mats.tree.build$mat$Mut_type == "SNV",]
all_muts <- all_muts[,c(2:5,7)]
all_muts$SampleID <- paste0(ID, "_", all_muts$node)
all_muts <- all_muts[,c(1:4,6)]
colnames(all_muts) <- c("Chr","Pos","Ref","Alt","SampleID")
genomeFile = "/Users/em16/Documents/Bioinformatics/CGP/Reference_genomes/hs37d5.fa"
subs_only = all_muts
nrow(subs_only)
## [1] 160216
colnames(subs_only) = c("chr", "pos", "ref", "mut")
subs_only = subs_only[(subs_only$ref %in% c("A","C","G","T")) & (subs_only$mut %in% c("A","C","G","T")) & subs_only$chr %in% c(1:22,"X","Y"),]
subs_only$trinuc_ref = as.vector(scanFa(genomeFile, GRanges(subs_only$chr, IRanges(subs_only$pos-1, subs_only$pos+1))))
# 2. Annotating the mutation from the pyrimidine base
ntcomp = c(T="A",G="C",C="G",A="T")
subs_only$sub = paste(subs_only$ref,subs_only$mut,sep=">")
subs_only$trinuc_ref_py = subs_only$trinuc_ref
for (j in 1:nrow(subs_only)) {
if (subs_only$ref[j] %in% c("A","G")) { # Purine base
subs_only$sub[j] = paste(ntcomp[subs_only$ref[j]],ntcomp[subs_only$mut[j]],sep=">")
subs_only$trinuc_ref_py[j] = paste(ntcomp[rev(strsplit(subs_only$trinuc_ref[j],split="")[[1]])],collapse="")
}
}
# 3. Counting subs
freqs = table(paste(subs_only$sub,paste(substr(subs_only$trinuc_ref_py,1,1),substr(subs_only$trinuc_ref_py,3,3),sep="-"),sep=","))
sub_vec = c("C>A","C>G","C>T","T>A","T>C","T>G")
ctx_vec = paste(rep(c("A","C","G","T"),each=4),rep(c("A","C","G","T"),times=4),sep="-")
full_vec = paste(rep(sub_vec,each=16),rep(ctx_vec,times=6),sep=",")
freqs_full = freqs[full_vec]
freqs_full[is.na(freqs_full)] = 0
names(freqs_full) = full_vec
xstr = paste(substr(full_vec,5,5), substr(full_vec,1,1), substr(full_vec,7,7), sep="")
colvec = rep(c("dodgerblue","black","red","grey70","olivedrab3","plum2"),each=16)
y = freqs_full
maxy = max(y)
h=barplot(y, las=2, col=colvec, border=NA, ylim_snv=c(0,maxy*1.5), space=1, cex.names=0.6, names.arg=xstr, ylab="# SNVs")
## Warning in plot.window(xlim, ylim, log = log, ...): "ylim_snv" is not a
## graphical parameter
## Warning in axis(if (horiz) 2 else 1, at = at.l, labels = names.arg, lty =
## axis.lty, : "ylim_snv" is not a graphical parameter
## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...):
## "ylim_snv" is not a graphical parameter
## Warning in axis(if (horiz) 1 else 2, cex.axis = cex.axis, ...): "ylim_snv" is
## not a graphical parameter
mtext(side=4, text= ID)
for (j in 1:length(sub_vec)) {
xpos = h[c((j-1)*16+1,j*16)]
rect(xpos[1]-0.5, maxy*1.2, xpos[2]+0.5, maxy*1.3, border=NA, col=colvec[j*16])
text(x=mean(xpos), y=maxy*1.3, pos=3, label=sub_vec[j])
}
This analysis is performed using a matrix of mutation burdens, indel burdens and telomere lengths per sample - available in mutation_telomere_analysis/input/Summary_cut.csv. The .csv file containing information on colony efficiency is available in mutation_telomere_analysis/input/Colony_efficiency_cut.csv)
col_efficiency <- read.csv("~/Documents/PhD/Lab_work/Colony_efficiency/Colony_efficiency_cut.csv", stringsAsFactors = F)
summ_cut <- read.csv("~/Documents/PhD/Sequencing_results/DNA_seq/XX_summary/telomere_mutation/data/Summary_cut.csv", stringsAsFactors = F)
summ_cut$donor_id <- factor(summ_cut$donor_id, levels = c("CB001", "CB002", "KX001", "KX002","SX001","AX001", "KX007","KX008","KX004","KX003"))
col_efficiency$donor_id <- factor(col_efficiency$donor_id, levels = c("CB001", "CB002", "KX001", "KX002","SX001","AX001", "KX007","KX008","KX004","KX003"))
ggplot(col_efficiency)+
theme_bw()+
scale_fill_manual(values = brewer.pal(10,"Paired")[c(1:4,7,8,5,6,9,10)])+
labs(y="Colony efficiency", x="Donor")+
theme(text=element_text(size=12))+
ylim(0,1.0)+
geom_col(aes(group = donor_id, y= efficiency, x = donor_id, fill = donor_id), width = 0.9)+
labs(title = "Colony Efficiency")
ggplot(summ_cut[summ_cut$platform == "hiseq" & summ_cut$cell_type == "HSC",])+
theme_bw()+
labs(x="Age (years)", y="Telomere lenght (bp)")+
xlim(-10,100)+
ylim(0,25000)+
theme(text=element_text(size=12))+
guides(fill="none")+
scale_color_manual(values = brewer.pal(10,"Paired")[c(1,3,4,7,8,9,10)])+
geom_jitter(aes(group = donor_id, x = age, y =tel_length, col = donor_id), width = 1)+
geom_boxplot(aes(group = donor_id, x = age, y =tel_length), outlier.alpha = 0 )+
labs(title = "Telomere length - all data included")
ggplot(summ_cut[summ_cut$platform == "hiseq" & summ_cut$cell_type == "HSC",])+
theme_bw()+
labs(x="Age (years)", y="Telomere length (bp)")+
xlim(-10,100)+
ylim(0,15000)+
theme(text=element_text(size=12))+
guides(fill="none")+
scale_color_manual(values = brewer.pal(10,"Paired")[c(1,3,4,7,8,9,10)])+
geom_jitter(aes(group = donor_id, x = age, y =tel_length, col = donor_id), width = 1)+
geom_boxplot(aes(group = donor_id, x = age, y =tel_length), outlier.alpha = 0 )+
labs(title = "Telomere length - y-axis truncated at 15000bp")
ggplot(summ_cut[summ_cut$cell_type == "HSC",])+
theme_bw()+
labs(x="Age (years)", y="Number SNVs")+
xlim(-10,100)+
ylim(0,2000)+
theme(text=element_text(size=12))+
guides(fill="none")+
scale_color_manual(values = brewer.pal(10,"Paired")[c(1:4,7,8,5,6,9,10)])+
geom_smooth(data = (summ_cut[summ_cut$cell_type == "HSC",]), method = "lm",aes(x = age, y =sub_adj), size = 0.5, colour = "grey", se = T)+
geom_jitter(aes(group = donor_id, x = age, y =sub_adj, col = donor_id))+
labs(title ="Number of single nucleotide variants")
## `geom_smooth()` using formula 'y ~ x'
ggplot(summ_cut[summ_cut$cell_type == "HSC",])+
theme_bw()+
labs(x="Age (years)", y="Number indels")+
xlim(-10,100)+
ylim(0,100)+
theme(text=element_text(size=12))+
geom_smooth(data = (summ_cut[summ_cut$cell_type == "HSC",]), method = "lm",aes(x = age, y =indel_adj), size = 0.5, colour = "grey", se = T)+
guides(fill="none")+
scale_color_manual(values = brewer.pal(10,"Paired")[c(1:4,7,8,5,6,9,10)])+
geom_jitter(aes(group = donor_id, x = age, y =indel_adj, col = donor_id))+
labs("Number of indels")
## `geom_smooth()` using formula 'y ~ x'
ggplot(subset(summ_cut, summ_cut$donor_id %in% c("CB002","SX001","AX001","KX004")))+
theme_bw()+
labs(x="Age (years)", y="Number indels")+
xlim(-10,100)+
ylim(0,100)+
theme(text=element_text(size=12))+
geom_smooth(data = (subset(summ_cut, summ_cut$donor_id %in% c("CB002","SX001","AX001","KX004") & summ_cut$cell_type == "HSC")), method = "lm",aes(x = age, y =indel_adj), size = 0.5, colour = "red", se = T)+
geom_smooth(data = (subset(summ_cut, summ_cut$donor_id %in% c("CB002","SX001","AX001","KX004") & summ_cut$cell_type == "Progenitor")), method = "lm",aes(x = age, y =indel_adj), size = 0.5, colour = "blue", se = T)+
labs(title = "Number of indels in progenitors (blue) and HSC/MPPs (red)")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
ggplot(subset(summ_cut, summ_cut$donor_id %in% c("CB002","SX001","AX001","KX004")))+
theme_bw()+
labs(x="Age (years)", y="Number SNVs")+
xlim(-10,100)+
ylim(0,2000)+
theme(text=element_text(size=12))+
geom_smooth(data = (subset(summ_cut, summ_cut$donor_id %in% c("CB002","SX001","AX001","KX004") & summ_cut$cell_type == "HSC")), method = "lm",aes(x = age, y =sub_adj), size = 0.5, colour = "red", se = T)+
geom_smooth(data = (subset(summ_cut, summ_cut$donor_id %in% c("CB002","SX001","AX001","KX004") & summ_cut$cell_type == "Progenitor")), method = "lm",aes(x = age, y =sub_adj), size = 0.5, colour = "blue", se = T)+
labs(title = "Number of SNVs in progenitors (blue) and HSC/MPPs (red)")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
This analysis is performed using data files available in the mutation_telomere_analysis/input/ folder.
SV_summ <- read.csv("~/Documents/PhD/Sequencing_results/DNA_seq/XX_summary/telomere_mutation/data/SV_summ.csv", stringsAsFactors = F, row.names = 1)
CN_summ <- read.csv("~/Documents/PhD/Sequencing_results/DNA_seq/XX_summary/telomere_mutation/data/CN_summ.csv", stringsAsFactors = F, row.names = 1)
SV_summ <- as.matrix(SV_summ[6:9,],)
CN_summ <- as.matrix(CN_summ[6:9,])
barplot(SV_summ, beside = FALSE, names.arg = c("CB001","CB002","KX001","KX002","SX001","AX001","KX007","KX008","KX004","KX003"), col = brewer.pal(10,"PRGn")[c(2,4,8,10)], ylim = c(0,0.04), cex.names = 0.5, legend.text = T,main = "Number of structural variant events \nexpressed as a fraction of total sample number")
These analyses are performed using the Summary_cut.csv file loaded earlier as the summ_cut object.
mean(summ_cut$mean_depth)
## [1] 14.13358
mean_tel_CB001 <- mean(summ_cut$tel_length[summ_cut$donor_id == "CB001"])
mean_tel_CB001
## [1] 6946.516
shapiro.test(summ_cut$tel_length[summ_cut$donor_id == "CB001" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"])
##
## Shapiro-Wilk normality test
##
## data: summ_cut$tel_length[summ_cut$donor_id == "CB001" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"]
## W = 0.79028, p-value = 1.058e-15
shapiro.test(summ_cut$tel_length[summ_cut$donor_id == "KX001" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"])
##
## Shapiro-Wilk normality test
##
## data: summ_cut$tel_length[summ_cut$donor_id == "KX001" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"]
## W = 0.97028, p-value = 2.297e-07
shapiro.test(summ_cut$tel_length[summ_cut$donor_id == "KX002" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"])
##
## Shapiro-Wilk normality test
##
## data: summ_cut$tel_length[summ_cut$donor_id == "KX002" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"]
## W = 0.86813, p-value < 2.2e-16
shapiro.test(summ_cut$tel_length[summ_cut$donor_id == "SX001" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"])
##
## Shapiro-Wilk normality test
##
## data: summ_cut$tel_length[summ_cut$donor_id == "SX001" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"]
## W = 0.91815, p-value = 1.847e-08
shapiro.test(summ_cut$tel_length[summ_cut$donor_id == "AX001" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"])
##
## Shapiro-Wilk normality test
##
## data: summ_cut$tel_length[summ_cut$donor_id == "AX001" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"]
## W = 0.95725, p-value = 3.08e-05
shapiro.test(summ_cut$tel_length[summ_cut$donor_id == "KX004" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"])
##
## Shapiro-Wilk normality test
##
## data: summ_cut$tel_length[summ_cut$donor_id == "KX004" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"]
## W = 0.93659, p-value = 0.0009823
shapiro.test(summ_cut$tel_length[summ_cut$donor_id == "KX003" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"])
##
## Shapiro-Wilk normality test
##
## data: summ_cut$tel_length[summ_cut$donor_id == "KX003" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"]
## W = 0.97375, p-value = 0.08012
ggqqplot(summ_cut$tel_length[summ_cut$donor_id == "CB001" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"], main = "CB001 Q-Q plot for telomere length")
ggqqplot(summ_cut$tel_length[summ_cut$donor_id == "KX001" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"], main = "KX001 Q-Q plot for telomere length")
ggqqplot(summ_cut$tel_length[summ_cut$donor_id == "KX002" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"], main = "KX002 Q-Q plot for telomere length")
ggqqplot(summ_cut$tel_length[summ_cut$donor_id == "SX001" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"], main = "SX001 Q-Q plot for telomere length")
ggqqplot(summ_cut$tel_length[summ_cut$donor_id == "AX001" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"], main = "AX001 Q-Q plot for telomere length")
ggqqplot(summ_cut$tel_length[summ_cut$donor_id == "KX004" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"])
ggqqplot(summ_cut$tel_length[summ_cut$donor_id == "KX003" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"], main = "KX003 Q-Q plot for telomere length")
ggdensity(summ_cut$tel_length[summ_cut$donor_id == "CB001" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"], main = "CB001", xlab = "Telomere length")
ggdensity(summ_cut$tel_length[summ_cut$donor_id == "KX001" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"], main = "KX001", xlab = "Telomere length")
ggdensity(summ_cut$tel_length[summ_cut$donor_id == "KX002" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"], main = "KX002", xlab = "Telomere length")
ggdensity(summ_cut$tel_length[summ_cut$donor_id == "SX001" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"], main = "SX001", xlab = "Telomere length")
ggdensity(summ_cut$tel_length[summ_cut$donor_id == "AX001" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"], main = "AX001", xlab = "Telomere length")
ggdensity(summ_cut$tel_length[summ_cut$donor_id == "KX004" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"])
ggdensity(summ_cut$tel_length[summ_cut$donor_id == "KX003" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"], main = "KX003", xlab = "Telomere length")
Outliers determined by interquartile range (IQR) criterion as used by the function boxplot
CB001_norm <- summ_cut[summ_cut$platform == "hiseq" & summ_cut$donor_id == "CB001", ]
CB001_outliers <- boxplot.stats(CB001_norm$tel_length)$out
CB001 <- (length(CB001_outliers)/length(CB001_norm$tel_length))*100
CB001
## [1] 6.965174
KX001_norm <- summ_cut[summ_cut$platform == "hiseq" & summ_cut$donor_id == "KX001", ]
KX001_outliers <- boxplot.stats(KX001_norm$tel_length)$out
KX001 <- (length(KX001_outliers)/length(KX001_norm$tel_length))*100
KX001
## [1] 2.457002
KX002_norm <- summ_cut[summ_cut$platform == "hiseq" & summ_cut$donor_id == "KX002", ]
KX002_outliers <- boxplot.stats(KX002_norm$tel_length)$out
KX002 <- (length(KX002_outliers)/length(KX002_norm$tel_length))*100
KX002
## [1] 5.263158
SX001_norm <- summ_cut[summ_cut$platform == "hiseq" & summ_cut$donor_id == "SX001", ]
SX001_outliers <- boxplot.stats(SX001_norm$tel_length)$out
SX001 <- (length(SX001_outliers)/length(SX001_norm$tel_length))*100
SX001
## [1] 0.5586592
AX001_norm <- summ_cut[summ_cut$platform == "hiseq" & summ_cut$donor_id == "AX001", ]
AX001_outliers <- boxplot.stats(AX001_norm$tel_length)$out
AX001 <- (length(AX001_outliers)/length(AX001_norm$tel_length))*100
AX001
## [1] 1.685393
KX004_norm <- summ_cut[summ_cut$platform == "hiseq" & summ_cut$donor_id == "KX004", ]
KX004_outliers <- boxplot.stats(KX004_norm$tel_length)$out
KX004 <- (length(KX004_outliers)/length(KX004_norm$tel_length))*100
KX004
## [1] 0
KX003_norm <- summ_cut[summ_cut$platform == "hiseq" & summ_cut$donor_id == "KX003", ]
KX003_outliers <- boxplot.stats(KX003_norm$tel_length)$out
KX003 <- (length(KX003_outliers)/length(KX003_norm$tel_length))*100
KX003
## [1] 1.176471
all_out <- rbind(CB001, KX001, KX002, SX001, AX001, KX004, KX003)
age <- c(0,29,38,48,63,77,81)
all_out <- cbind(all_out, age)
colnames(all_out) <- c("outliers", "age")
all_out <- as.data.frame(all_out)
ggplot(data= all_out)+
theme_bw()+
labs(x="Age (years)", y="Percentage HSC/MPPs with outlying telomere lengths")+
xlim(-10,100)+
scale_y_continuous(breaks = c(0,2,4,6,8))+
theme(text=element_text(size=12))+
geom_point(data = all_out, aes(x = age, y = all_out$outliers))+
geom_smooth(data = all_out, method = lm, aes(x = age, y = outliers), se = FALSE)
## Warning: Use of `all_out$outliers` is discouraged. Use `outliers` instead.
## `geom_smooth()` using formula 'y ~ x'
outliers <- all_out$outliers
lm_out <- lm(outliers ~ age)
summary(lm_out)
##
## Call:
## lm(formula = outliers ~ age)
##
## Residuals:
## 1 2 3 4 5 6 7
## 0.7734 -1.5566 1.9255 -2.0279 0.2255 -0.4084 1.0685
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.19181 1.22117 5.070 0.00387 **
## age -0.07511 0.02227 -3.373 0.01983 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.562 on 5 degrees of freedom
## Multiple R-squared: 0.6947, Adjusted R-squared: 0.6336
## F-statistic: 11.37 on 1 and 5 DF, p-value: 0.01983
These analyses are performed using the Summary_cut.csv file loaded earlier as the summ_cut object.
age.mut <- lmer(sub_adj ~ age + (age | donor_id), data = summ_cut[summ_cut$cell_type == "HSC",], REML = F)
## boundary (singular) fit: see ?isSingular
age.mut
## Linear mixed model fit by maximum likelihood ['lmerMod']
## Formula: sub_adj ~ age + (age | donor_id)
## Data: summ_cut[summ_cut$cell_type == "HSC", ]
## AIC BIC logLik deviance df.resid
## 37678.92 37715.67 -18833.46 37666.92 3368
## Random effects:
## Groups Name Std.Dev. Corr
## donor_id (Intercept) 1.0661
## age 0.3811 1.00
## Residual 63.9693
## Number of obs: 3374, groups: donor_id, 10
## Fixed Effects:
## (Intercept) age
## 54.57 16.83
## optimizer (nloptwrap) convergence code: 0 (OK) ; 0 optimizer warnings; 1 lme4 warnings
summary(age.mut)
## Linear mixed model fit by maximum likelihood ['lmerMod']
## Formula: sub_adj ~ age + (age | donor_id)
## Data: summ_cut[summ_cut$cell_type == "HSC", ]
##
## AIC BIC logLik deviance df.resid
## 37678.9 37715.7 -18833.5 37666.9 3368
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.4623 -0.5092 -0.0154 0.4903 4.1832
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## donor_id (Intercept) 1.1367 1.0661
## age 0.1452 0.3811 1.00
## Residual 4092.0705 63.9693
## Number of obs: 3374, groups: donor_id, 10
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 54.5650 2.8988 18.82
## age 16.8318 0.1529 110.06
##
## Correlation of Fixed Effects:
## (Intr)
## age -0.346
## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see ?isSingular
age.indel <- lmer(indel_adj ~ age + (age | donor_id), data = summ_cut[summ_cut$cell_type == "HSC",], REML = F)
## boundary (singular) fit: see ?isSingular
age.indel
## Linear mixed model fit by maximum likelihood ['lmerMod']
## Formula: indel_adj ~ age + (age | donor_id)
## Data: summ_cut[summ_cut$cell_type == "HSC", ]
## AIC BIC logLik deviance df.resid
## 15767.391 15804.135 -7877.696 15755.391 3368
## Random effects:
## Groups Name Std.Dev. Corr
## donor_id (Intercept) 0.54908
## age 0.01895 -1.00
## Residual 2.48903
## Number of obs: 3374, groups: donor_id, 10
## Fixed Effects:
## (Intercept) age
## -0.3301 0.1195
## optimizer (nloptwrap) convergence code: 0 (OK) ; 0 optimizer warnings; 1 lme4 warnings
summary(age.indel)
## Linear mixed model fit by maximum likelihood ['lmerMod']
## Formula: indel_adj ~ age + (age | donor_id)
## Data: summ_cut[summ_cut$cell_type == "HSC", ]
##
## AIC BIC logLik deviance df.resid
## 15767.4 15804.1 -7877.7 15755.4 3368
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.9666 -0.5561 -0.1074 0.5364 4.1523
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## donor_id (Intercept) 0.3014859 0.54908
## age 0.0003592 0.01895 -1.00
## Residual 6.1952505 2.48903
## Number of obs: 3374, groups: donor_id, 10
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) -0.330075 0.248525 -1.328
## age 0.119526 0.006897 17.330
##
## Correlation of Fixed Effects:
## (Intr)
## age -0.922
## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see ?isSingular
age.tel <- lmer(tel_length ~ age + (age | donor_id), data = subset(summ_cut, summ_cut$platform == "hiseq" & !summ_cut$donor_id %in% c("CB001", "KX003") & summ_cut$cell_type == "HSC"), REML = F)
age.tel
## Linear mixed model fit by maximum likelihood ['lmerMod']
## Formula: tel_length ~ age + (age | donor_id)
## Data:
## subset(summ_cut, summ_cut$platform == "hiseq" & !summ_cut$donor_id %in%
## c("CB001", "KX003") & summ_cut$cell_type == "HSC")
## AIC BIC logLik deviance df.resid
## 19159.945 19190.580 -9573.973 19147.945 1213
## Random effects:
## Groups Name Std.Dev. Corr
## donor_id (Intercept) 628.2
## age 21.4 -1.00
## Residual 618.5
## Number of obs: 1219, groups: donor_id, 5
## Fixed Effects:
## (Intercept) age
## 4543.20 -31.87
## optimizer (nloptwrap) convergence code: 0 (OK) ; 0 optimizer warnings; 1 lme4 warnings
summary(age.tel)
## Linear mixed model fit by maximum likelihood ['lmerMod']
## Formula: tel_length ~ age + (age | donor_id)
## Data:
## subset(summ_cut, summ_cut$platform == "hiseq" & !summ_cut$donor_id %in%
## c("CB001", "KX003") & summ_cut$cell_type == "HSC")
##
## AIC BIC logLik deviance df.resid
## 19159.9 19190.6 -9574.0 19147.9 1213
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.4215 -0.6373 -0.1048 0.4944 7.2819
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## donor_id (Intercept) 394629.0 628.2
## age 457.9 21.4 -1.00
## Residual 382564.9 618.5
## Number of obs: 1219, groups: donor_id, 5
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 4543.20 316.74 14.344
## age -31.87 10.70 -2.979
##
## Correlation of Fixed Effects:
## (Intr)
## age -0.995
## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see ?isSingular
age.cell.type <- lmer(sub_adj ~ age + cell_type + (age | donor_id) + (cell_type | donor_id), data = subset(summ_cut, summ_cut$donor_id %in% c("CB002","SX001","AX001","KX004")), REML = F)
age.cell.type
## Linear mixed model fit by maximum likelihood ['lmerMod']
## Formula: sub_adj ~ age + cell_type + (age | donor_id) + (cell_type | donor_id)
## Data: subset(summ_cut, summ_cut$donor_id %in% c("CB002", "SX001", "AX001",
## "KX004"))
## AIC BIC logLik deviance df.resid
## 17394.786 17448.336 -8687.393 17374.786 1554
## Random effects:
## Groups Name Std.Dev. Corr
## donor_id (Intercept) 6.14923
## age 0.08256 1.00
## donor_id.1 (Intercept) 0.45273
## cell_typeProgenitor 24.75478 0.06
## Residual 62.19801
## Number of obs: 1564, groups: donor_id, 4
## Fixed Effects:
## (Intercept) age cell_typeProgenitor
## 49.76 16.64 28.72
## optimizer (nloptwrap) convergence code: 0 (OK) ; 0 optimizer warnings; 1 lme4 warnings
summary(age.cell.type)
## Linear mixed model fit by maximum likelihood ['lmerMod']
## Formula: sub_adj ~ age + cell_type + (age | donor_id) + (cell_type | donor_id)
## Data: subset(summ_cut, summ_cut$donor_id %in% c("CB002", "SX001", "AX001",
## "KX004"))
##
## AIC BIC logLik deviance df.resid
## 17394.8 17448.3 -8687.4 17374.8 1554
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.8958 -0.4078 0.0095 0.5054 3.2297
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## donor_id (Intercept) 3.781e+01 6.14923
## age 6.817e-03 0.08256 1.00
## donor_id.1 (Intercept) 2.050e-01 0.45273
## cell_typeProgenitor 6.128e+02 24.75478 0.06
## Residual 3.869e+03 62.19801
## Number of obs: 1564, groups: donor_id, 4
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 49.7630 6.9987 7.11
## age 16.6434 0.1555 107.00
## cell_typeProgenitor 28.7219 14.0138 2.05
##
## Correlation of Fixed Effects:
## (Intr) age
## age -0.713
## cll_typPrgn -0.036 0.008
## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see ?isSingular
age.cell.type.indel <- lmer(indel_adj ~ age + cell_type + (age | donor_id) + (cell_type | donor_id), data = subset(summ_cut, summ_cut$donor_id %in% c("CB002","SX001","AX001","KX004")), REML = F)
## boundary (singular) fit: see ?isSingular
age.cell.type.indel
## Linear mixed model fit by maximum likelihood ['lmerMod']
## Formula: indel_adj ~ age + cell_type + (age | donor_id) + (cell_type |
## donor_id)
## Data: subset(summ_cut, summ_cut$donor_id %in% c("CB002", "SX001", "AX001",
## "KX004"))
## AIC BIC logLik deviance df.resid
## 7310.441 7363.991 -3645.220 7290.441 1554
## Random effects:
## Groups Name Std.Dev. Corr
## donor_id (Intercept) 0.241736
## age 0.008164 -1.00
## donor_id.1 (Intercept) 0.255541
## cell_typeProgenitor 0.025839 1.00
## Residual 2.481492
## Number of obs: 1564, groups: donor_id, 4
## Fixed Effects:
## (Intercept) age cell_typeProgenitor
## 0.29289 0.11172 0.04258
## optimizer (nloptwrap) convergence code: 0 (OK) ; 0 optimizer warnings; 1 lme4 warnings
summary(age.cell.type.indel)
## Linear mixed model fit by maximum likelihood ['lmerMod']
## Formula: indel_adj ~ age + cell_type + (age | donor_id) + (cell_type |
## donor_id)
## Data: subset(summ_cut, summ_cut$donor_id %in% c("CB002", "SX001", "AX001",
## "KX004"))
##
## AIC BIC logLik deviance df.resid
## 7310.4 7364.0 -3645.2 7290.4 1554
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.9977 -0.5563 -0.1347 0.5276 4.1544
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## donor_id (Intercept) 5.844e-02 0.241736
## age 6.664e-05 0.008164 -1.00
## donor_id.1 (Intercept) 6.530e-02 0.255541
## cell_typeProgenitor 6.677e-04 0.025839 1.00
## Residual 6.158e+00 2.481492
## Number of obs: 1564, groups: donor_id, 4
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 0.292890 0.359722 0.814
## age 0.111723 0.006943 16.092
## cell_typeProgenitor 0.042579 0.189443 0.225
##
## Correlation of Fixed Effects:
## (Intr) age
## age -0.845
## cll_typPrgn -0.070 0.036
## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see ?isSingular